Data Loading and Inspection¶

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
In [3]:
# Load the vehicle-insurance CSV from the current working directory into `data`.
# NOTE(review): the "(3)" suffix looks like a browser re-download copy — confirm this is the intended file.
data=pd.read_csv('Vehicle_Insurance (3).csv')
In [4]:
data
Out[4]:
id Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response
0 1 Male 44 1 28.0 0 > 2 Years Yes 40454.0 26.0 217 1
1 2 Male 76 1 3.0 0 1-2 Year No 33536.0 26.0 183 0
2 3 Male 47 1 28.0 0 > 2 Years Yes 38294.0 26.0 27 1
3 4 Male 21 1 11.0 1 < 1 Year No 28619.0 152.0 203 0
4 5 Female 29 1 41.0 1 < 1 Year No 27496.0 152.0 39 0
... ... ... ... ... ... ... ... ... ... ... ... ...
381104 381105 Male 74 1 26.0 1 1-2 Year No 30170.0 26.0 88 0
381105 381106 Male 30 1 37.0 1 < 1 Year No 40016.0 152.0 131 0
381106 381107 Male 21 1 30.0 1 < 1 Year No 35118.0 160.0 161 0
381107 381108 Female 68 1 14.0 0 > 2 Years Yes 44617.0 124.0 74 0
381108 381109 Male 46 1 29.0 0 1-2 Year No 41777.0 26.0 237 0

381109 rows × 12 columns

In [5]:
data.shape
Out[5]:
(381109, 12)
In [6]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   Gender                381109 non-null  object 
 2   Age                   381109 non-null  int64  
 3   Driving_License       381109 non-null  int64  
 4   Region_Code           381109 non-null  float64
 5   Previously_Insured    381109 non-null  int64  
 6   Vehicle_Age           381109 non-null  object 
 7   Vehicle_Damage        381109 non-null  object 
 8   Annual_Premium        381109 non-null  float64
 9   Policy_Sales_Channel  381109 non-null  float64
 10  Vintage               381109 non-null  int64  
 11  Response              381109 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 34.9+ MB
In [7]:
data.count()
Out[7]:
id                      381109
Gender                  381109
Age                     381109
Driving_License         381109
Region_Code             381109
Previously_Insured      381109
Vehicle_Age             381109
Vehicle_Damage          381109
Annual_Premium          381109
Policy_Sales_Channel    381109
Vintage                 381109
Response                381109
dtype: int64
In [8]:
data.head(10)
Out[8]:
id Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response
0 1 Male 44 1 28.0 0 > 2 Years Yes 40454.0 26.0 217 1
1 2 Male 76 1 3.0 0 1-2 Year No 33536.0 26.0 183 0
2 3 Male 47 1 28.0 0 > 2 Years Yes 38294.0 26.0 27 1
3 4 Male 21 1 11.0 1 < 1 Year No 28619.0 152.0 203 0
4 5 Female 29 1 41.0 1 < 1 Year No 27496.0 152.0 39 0
5 6 Female 24 1 33.0 0 < 1 Year Yes 2630.0 160.0 176 0
6 7 Male 23 1 11.0 0 < 1 Year Yes 23367.0 152.0 249 0
7 8 Female 56 1 28.0 0 1-2 Year Yes 32031.0 26.0 72 1
8 9 Female 24 1 3.0 1 < 1 Year No 27619.0 152.0 28 0
9 10 Female 32 1 6.0 1 < 1 Year No 28771.0 152.0 80 0
In [9]:
data.tail(10)
Out[9]:
id Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response
381099 381100 Female 51 1 28.0 0 1-2 Year Yes 44504.0 26.0 71 0
381100 381101 Female 29 1 28.0 0 < 1 Year Yes 49007.0 124.0 137 0
381101 381102 Female 70 1 28.0 0 > 2 Years Yes 50904.0 122.0 215 0
381102 381103 Female 25 1 41.0 1 < 1 Year Yes 2630.0 152.0 102 0
381103 381104 Male 47 1 50.0 0 1-2 Year Yes 39831.0 26.0 235 0
381104 381105 Male 74 1 26.0 1 1-2 Year No 30170.0 26.0 88 0
381105 381106 Male 30 1 37.0 1 < 1 Year No 40016.0 152.0 131 0
381106 381107 Male 21 1 30.0 1 < 1 Year No 35118.0 160.0 161 0
381107 381108 Female 68 1 14.0 0 > 2 Years Yes 44617.0 124.0 74 0
381108 381109 Male 46 1 29.0 0 1-2 Year No 41777.0 26.0 237 0
In [10]:
data.columns
Out[10]:
Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')
In [11]:
data.dtypes
Out[11]:
id                        int64
Gender                   object
Age                       int64
Driving_License           int64
Region_Code             float64
Previously_Insured        int64
Vehicle_Age              object
Vehicle_Damage           object
Annual_Premium          float64
Policy_Sales_Channel    float64
Vintage                   int64
Response                  int64
dtype: object
In [12]:
# Names of every object-dtype column, as a plain Python list
categorical_columns = list(data.select_dtypes(include='object').columns)
In [13]:
categorical_columns
Out[13]:
['Gender', 'Vehicle_Age', 'Vehicle_Damage']
In [14]:
# Names of every numeric-dtype column, as a plain Python list
numerical_columns = list(data.select_dtypes(include='number').columns)
In [15]:
numerical_columns
Out[15]:
['id',
 'Age',
 'Driving_License',
 'Region_Code',
 'Previously_Insured',
 'Annual_Premium',
 'Policy_Sales_Channel',
 'Vintage',
 'Response']
In [16]:
# Map each column name to the array of distinct values found in that column
unique_values = {name: series.unique() for name, series in data.items()}
In [17]:
unique_values
Out[17]:
{'id': array([     1,      2,      3, ..., 381107, 381108, 381109]),
 'Gender': array(['Male', 'Female'], dtype=object),
 'Age': array([44, 76, 47, 21, 29, 24, 23, 56, 32, 41, 71, 37, 25, 42, 60, 65, 49,
        34, 51, 26, 57, 79, 48, 45, 72, 30, 54, 27, 38, 22, 78, 20, 39, 62,
        58, 59, 63, 50, 67, 77, 28, 69, 52, 31, 33, 43, 36, 53, 70, 46, 55,
        40, 61, 75, 64, 35, 66, 68, 74, 73, 84, 83, 81, 80, 82, 85]),
 'Driving_License': array([1, 0]),
 'Region_Code': array([28.,  3., 11., 41., 33.,  6., 35., 50., 15., 45.,  8., 36., 30.,
        26., 16., 47., 48., 19., 39., 23., 37.,  5., 17.,  2.,  7., 29.,
        46., 27., 25., 13., 18., 20., 49., 22., 44.,  0.,  9., 31., 12.,
        34., 21., 10., 14., 38., 24., 40., 43., 32.,  4., 51., 42.,  1.,
        52.]),
 'Previously_Insured': array([0, 1]),
 'Vehicle_Age': array(['> 2 Years', '1-2 Year', '< 1 Year'], dtype=object),
 'Vehicle_Damage': array(['Yes', 'No'], dtype=object),
 'Annual_Premium': array([ 40454.,  33536.,  38294., ...,  20706., 101664.,  69845.]),
 'Policy_Sales_Channel': array([ 26., 152., 160., 124.,  14.,  13.,  30., 156., 163., 157., 122.,
         19.,  22.,  15., 154.,  16.,  52., 155.,  11., 151., 125.,  25.,
         61.,   1.,  86.,  31., 150.,  23.,  60.,  21., 121.,   3., 139.,
         12.,  29.,  55.,   7.,  47., 127., 153.,  78., 158.,  89.,  32.,
          8.,  10., 120.,  65.,   4.,  42.,  83., 136.,  24.,  18.,  56.,
         48., 106.,  54.,  93., 116.,  91.,  45.,   9., 145., 147.,  44.,
        109.,  37., 140., 107., 128., 131., 114., 118., 159., 119., 105.,
        135.,  62., 138., 129.,  88.,  92., 111., 113.,  73.,  36.,  28.,
         35.,  59.,  53., 148., 133., 108.,  64.,  39.,  94., 132.,  46.,
         81., 103.,  90.,  51.,  27., 146.,  63.,  96.,  40.,  66., 100.,
         95., 123.,  98.,  75.,  69., 130., 134.,  49.,  97.,  38.,  17.,
        110.,  80.,  71., 117.,  58.,  20.,  76., 104.,  87.,  84., 137.,
        126.,  68.,  67., 101., 115.,  57.,  82.,  79., 112.,  99.,  70.,
          2.,  34.,  33.,  74., 102., 149.,  43.,   6.,  50., 144., 143.,
         41.]),
 'Vintage': array([217, 183,  27, 203,  39, 176, 249,  72,  28,  80,  46, 289, 221,
         15,  58, 147, 256, 299, 158, 102, 116, 177, 232,  60, 180,  49,
         57, 223, 136, 222, 149, 169,  88, 253, 107, 264, 233,  45, 184,
        251, 153, 186,  71,  34,  83,  12, 246, 141, 216, 130, 282,  73,
        171, 283, 295, 165,  30, 218,  22,  36,  79,  81, 100,  63, 242,
        277,  61, 111, 167,  74, 235, 131, 243, 248, 114, 281,  62, 189,
        139, 138, 209, 254, 291,  68,  92,  52,  78, 156, 247, 275,  77,
        181, 229, 166,  16,  23,  31, 293, 219,  50, 155,  66, 260,  19,
        258, 117, 193, 204, 212, 144, 234, 206, 228, 125,  29,  18,  84,
        230,  54, 123, 101,  86,  13, 237,  85,  98,  67, 128,  95,  89,
         99, 208, 134, 135, 268, 284, 119, 226, 105, 142, 207, 272, 263,
         64,  40, 245, 163,  24, 265, 202, 259,  91, 106, 190, 162,  33,
        194, 287, 292,  69, 239, 132, 255, 152, 121, 150, 143, 198, 103,
        127, 285, 214, 151, 199,  56,  59, 215, 104, 238, 120,  21,  32,
        270, 211, 200, 197,  11, 213,  93, 113, 178,  10, 290,  94, 231,
        296,  47, 122, 271, 278, 276,  96, 240, 172, 257, 224, 173, 220,
        185,  90,  51, 205,  70, 160, 137, 168,  87, 118, 288, 126, 241,
         82, 227, 115, 164, 236, 286, 244, 108, 274, 201,  97,  25, 174,
        182, 154,  48,  20,  53,  17, 261,  41, 266,  35, 140, 269, 146,
        145,  65, 298, 133, 195,  55, 188,  75,  38,  43, 110,  37, 129,
        170, 109, 267, 279, 112, 280,  76, 191,  26, 161, 179, 175, 252,
         42, 124, 187, 148, 294,  44, 157, 192, 262, 159, 210, 250,  14,
        273, 297, 225, 196]),
 'Response': array([1, 0])}
In [18]:
data.describe()
Out[18]:
id Age Driving_License Region_Code Previously_Insured Annual_Premium Policy_Sales_Channel Vintage Response
count 381109.000000 381109.000000 381109.000000 381109.000000 381109.000000 381109.000000 381109.000000 381109.000000 381109.000000
mean 190555.000000 38.822584 0.997869 26.388807 0.458210 30564.389581 112.034295 154.347397 0.122563
std 110016.836208 15.511611 0.046110 13.229888 0.498251 17213.155057 54.203995 83.671304 0.327936
min 1.000000 20.000000 0.000000 0.000000 0.000000 2630.000000 1.000000 10.000000 0.000000
25% 95278.000000 25.000000 1.000000 15.000000 0.000000 24405.000000 29.000000 82.000000 0.000000
50% 190555.000000 36.000000 1.000000 28.000000 0.000000 31669.000000 133.000000 154.000000 0.000000
75% 285832.000000 49.000000 1.000000 35.000000 1.000000 39400.000000 152.000000 227.000000 0.000000
max 381109.000000 85.000000 1.000000 52.000000 1.000000 540165.000000 163.000000 299.000000 1.000000
In [19]:
# Region codes are whole numbers stored as floats; cast the column to int
data = data.astype({'Region_Code': int})
In [20]:
# Cast the premium column from float to int
data = data.astype({'Annual_Premium': int})
In [21]:
# Sales-channel codes are whole numbers stored as floats; cast the column to int
data = data.astype({'Policy_Sales_Channel': int})
In [22]:
data
Out[22]:
id Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response
0 1 Male 44 1 28 0 > 2 Years Yes 40454 26 217 1
1 2 Male 76 1 3 0 1-2 Year No 33536 26 183 0
2 3 Male 47 1 28 0 > 2 Years Yes 38294 26 27 1
3 4 Male 21 1 11 1 < 1 Year No 28619 152 203 0
4 5 Female 29 1 41 1 < 1 Year No 27496 152 39 0
... ... ... ... ... ... ... ... ... ... ... ... ...
381104 381105 Male 74 1 26 1 1-2 Year No 30170 26 88 0
381105 381106 Male 30 1 37 1 < 1 Year No 40016 152 131 0
381106 381107 Male 21 1 30 1 < 1 Year No 35118 160 161 0
381107 381108 Female 68 1 14 0 > 2 Years Yes 44617 124 74 0
381108 381109 Male 46 1 29 0 1-2 Year No 41777 26 237 0

381109 rows × 12 columns

In [23]:
# Put the unit in the column name so the values can later be stripped of the 'Year(s)' text
data = data.rename(columns={'Vehicle_Age': 'Vehicle_Age(years)'})
In [24]:
data
Out[24]:
id Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age(years) Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response
0 1 Male 44 1 28 0 > 2 Years Yes 40454 26 217 1
1 2 Male 76 1 3 0 1-2 Year No 33536 26 183 0
2 3 Male 47 1 28 0 > 2 Years Yes 38294 26 27 1
3 4 Male 21 1 11 1 < 1 Year No 28619 152 203 0
4 5 Female 29 1 41 1 < 1 Year No 27496 152 39 0
... ... ... ... ... ... ... ... ... ... ... ... ...
381104 381105 Male 74 1 26 1 1-2 Year No 30170 26 88 0
381105 381106 Male 30 1 37 1 < 1 Year No 40016 152 131 0
381106 381107 Male 21 1 30 1 < 1 Year No 35118 160 161 0
381107 381108 Female 68 1 14 0 > 2 Years Yes 44617 124 74 0
381108 381109 Male 46 1 29 0 1-2 Year No 41777 26 237 0

381109 rows × 12 columns

In [25]:
# Drop the trailing " Year" / " Years" text from each label.
# str.rstrip('Years') strips any trailing run of the characters Y/e/a/r/s (not the suffix)
# and leaves a trailing space behind, so an anchored regex is used instead.
data['Vehicle_Age(years)'] = data['Vehicle_Age(years)'].str.replace(r'\s*Years?$', '', regex=True)
In [26]:
# Remove leftover surrounding whitespace from the vehicle-age labels.
# The previous rstrip('year') could never match here: after the step above the labels
# end in a digit or a space, so the call was a no-op.
data['Vehicle_Age(years)'] = data['Vehicle_Age(years)'].str.strip()
In [27]:
data
Out[27]:
id Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age(years) Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response
0 1 Male 44 1 28 0 > 2 Yes 40454 26 217 1
1 2 Male 76 1 3 0 1-2 No 33536 26 183 0
2 3 Male 47 1 28 0 > 2 Yes 38294 26 27 1
3 4 Male 21 1 11 1 < 1 No 28619 152 203 0
4 5 Female 29 1 41 1 < 1 No 27496 152 39 0
... ... ... ... ... ... ... ... ... ... ... ... ...
381104 381105 Male 74 1 26 1 1-2 No 30170 26 88 0
381105 381106 Male 30 1 37 1 < 1 No 40016 152 131 0
381106 381107 Male 21 1 30 1 < 1 No 35118 160 161 0
381107 381108 Female 68 1 14 0 > 2 Yes 44617 124 74 0
381108 381109 Male 46 1 29 0 1-2 No 41777 26 237 0

381109 rows × 12 columns

In [28]:
data.duplicated().sum()
Out[28]:
0

Data Cleaning¶

Handling Null Values¶

In [31]:
data.isnull()
Out[31]:
id Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age(years) Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response
0 False False False False False False False False False False False False
1 False False False False False False False False False False False False
2 False False False False False False False False False False False False
3 False False False False False False False False False False False False
4 False False False False False False False False False False False False
... ... ... ... ... ... ... ... ... ... ... ... ...
381104 False False False False False False False False False False False False
381105 False False False False False False False False False False False False
381106 False False False False False False False False False False False False
381107 False False False False False False False False False False False False
381108 False False False False False False False False False False False False

381109 rows × 12 columns

In [32]:
data.isnull().sum()
Out[32]:
id                      0
Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age(years)      0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
dtype: int64
In [33]:
data
Out[33]:
id Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age(years) Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response
0 1 Male 44 1 28 0 > 2 Yes 40454 26 217 1
1 2 Male 76 1 3 0 1-2 No 33536 26 183 0
2 3 Male 47 1 28 0 > 2 Yes 38294 26 27 1
3 4 Male 21 1 11 1 < 1 No 28619 152 203 0
4 5 Female 29 1 41 1 < 1 No 27496 152 39 0
... ... ... ... ... ... ... ... ... ... ... ... ...
381104 381105 Male 74 1 26 1 1-2 No 30170 26 88 0
381105 381106 Male 30 1 37 1 < 1 No 40016 152 131 0
381106 381107 Male 21 1 30 1 < 1 No 35118 160 161 0
381107 381108 Female 68 1 14 0 > 2 Yes 44617 124 74 0
381108 381109 Male 46 1 29 0 1-2 No 41777 26 237 0

381109 rows × 12 columns

Handling Outliers¶

In [35]:
# Box plot of Annual_Premium as loaded, before any capping is applied
ax = sns.boxplot(data['Annual_Premium'])
ax.set_xlabel('Finding Outliers in Annual_Premium')
plt.show()
In [36]:
# Function to cap outliers using IQR method
def cap_outliers(data, column, factor=1.5):
    """Return a copy of ``data`` with ``column`` clipped to its IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of ``data[column]``. Values outside the fences
    are replaced by the nearest fence; values inside are left unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to cap. It is not modified.
    column : str
        Name of the numeric column to cap.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        A new frame, identical to ``data`` except for the capped column.
    """
    q1 = data[column].quantile(0.25)
    q3 = data[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Work on a copy so the caller's frame (and any earlier displayed view of it) is untouched
    capped = data.copy()
    capped[column] = capped[column].clip(lower=lower_bound, upper=upper_bound)
    return capped
In [37]:
# Cap Annual_Premium at its IQR fences and rebind `data` to the result
data = cap_outliers(data=data, column='Annual_Premium')
In [38]:
# Same box plot again, now on the capped Annual_Premium values
ax = sns.boxplot(data['Annual_Premium'], color='#72a3fe')
ax.set_xlabel('Finding Outliers in Annual_Premium')
plt.show()

Data Visualization¶

Histplot¶

In [41]:
# Histogram of customer age (30 bins) with a KDE curve overlaid
ax = sns.histplot(data['Age'], bins=30, kde=True, color='#DE3163')
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Distribution')
plt.show()
In [42]:
# Histogram of Annual_Premium (30 bins) with a KDE curve overlaid
ax = sns.histplot(data['Annual_Premium'], bins=30, kde=True, color='#48A6A7')
ax.set_title('Distribution of Annual Premium')
ax.set_xlabel('Annual_Premium')
ax.set_ylabel('Frequency')
plt.show()
In [43]:
# Filled density estimate of Vintage.
# `fill=True` replaces the deprecated `shade=True` and matches the other KDE cells in this notebook.
sns.kdeplot(data['Vintage'], fill=True, color='#3a5a40')
plt.title('KDE Plot of Vintage')
plt.xlabel('Vintage')
plt.show()
In [44]:
# Unfilled density estimate of the sales-channel codes
ax = sns.kdeplot(data['Policy_Sales_Channel'], color='#800f2f')
ax.set_title('Distribution of Policy Sales Channel')
ax.set_xlabel('Policy Sales Channel')
ax.set_ylabel('Distribution')
plt.show()
In [45]:
# Bar per gender showing the mean Age, with error bars switched off
sns.barplot(
    x='Gender',
    y='Age',
    data=data,
    palette=['#A0E7E5', '#B4F8C8'],
    ci=None,
)
plt.show()
In [46]:
# Age histogram split by Previously_Insured, drawn as polygons.
# Age runs from 20 to 85 in this dataset, so the 5-year bins must span that range;
# the earlier range(0, 51, 5) silently dropped every customer older than 50.
sns.histplot(data=data, x='Age', bins=range(20, 91, 5), hue='Previously_Insured', element='poly', palette='deep')
plt.title('Frequency Distribution of Age with respect to Previously_Insured')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()
In [47]:
# Sales-channel histogram split by Vehicle_Damage, drawn as steps.
# Channel codes run from 1 to 163, so the bin edges go up to 165;
# the earlier range(0, 51, 5) discarded every code above 50 (the median code is 133).
sns.histplot(data=data, x='Policy_Sales_Channel', bins=range(0, 166, 5), hue='Vehicle_Damage', element='step', palette='pastel')
plt.title('Distribution of Policy Sales Channel')
plt.xlabel('Policy_Sales_Channel')
plt.ylabel('Frequency(Count)')
plt.show()
In [48]:
# Filled density of the sales-channel codes, one curve per Previously_Insured value
ax = sns.kdeplot(
    data=data,
    x='Policy_Sales_Channel',
    hue='Previously_Insured',
    palette=['#2ECC71', '#9B59B6'],
    fill=True,
)
ax.set_title('KDE Plot of Policy Sales Channel')
ax.set_xlabel('Policy Sales Channel')
plt.show()
In [49]:
# Mean sales-channel code per vehicle-age group, one line per Vehicle_Damage value.
# The x axis is the vehicle's age bucket, not the customer's age, so the title and label say so.
sns.lineplot(data=data, x='Vehicle_Age(years)', y='Policy_Sales_Channel', hue='Vehicle_Damage')
plt.title('Policy Sales Channel with respect to Vehicle Age')
plt.xlabel('Vehicle Age (years)')
plt.ylabel('Policy_Sales_Channel')
plt.show()

Countplot¶

In [51]:
# Number of rows in each vehicle-age bucket
ax = sns.countplot(data=data, x='Vehicle_Age(years)', palette=['#f0f7e0', '#d3bbdd', '#bc96ca'])
ax.set_title('Vehicle Age Distribution')
ax.set_xlabel('Vehicle Age')
ax.set_ylabel('Count')
plt.show()
In [52]:
# Number of rows per gender on a 5x5 inch figure
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=data, x='Gender', palette=['#deb3ad', '#eb7c8f'], ax=ax)
ax.set_title('Gender Distribution')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
plt.show()
In [53]:
# Box plot of the 0/1 Response column
ax = sns.boxplot(data['Response'])
ax.set_xlabel('Finding Outliers in Response')
plt.show()
In [54]:
# Box plot of the 0/1 Driving_License column
ax = sns.boxplot(data['Driving_License'])
ax.set_xlabel('Finding Outliers in Driving_License')
plt.show()

Feature Analysis¶

Examine the relationship between features and the target variable (insurance claims)¶

Countplot¶

In [327]:
data["Response"].value_counts()
Out[327]:
Response
0    334399
1     46710
Name: count, dtype: int64
In [ ]:
# Distribution of Target Variable
In [329]:
# Row count for each Response value
ax = sns.countplot(data=data, x='Response', palette=['#bbc0b6', '#d58469'])
ax.set_title('Distribution of Insurance Claims(Response)')
ax.set_xlabel('Response (0 = No Claim, 1 = Claim)')
ax.set_ylabel('Count')
plt.show()
In [ ]:
# Vehicle Age vs Insurance Claims (Response)
In [333]:
# Row counts per Response value, split by vehicle-age bucket.
# The plot shows counts grouped by vehicle age, so the title and axis labels describe that
# (they previously referred to Gender and an average response rate).
sns.countplot(x='Response', hue='Vehicle_Age(years)', data=data, palette="pastel")
plt.title('Insurance Claims by Vehicle Age')
plt.xlabel('Response (0 = No Claim, 1 = Claim)')
plt.ylabel('Count')
plt.show()

Pairplot¶

In [ ]:
# showing all the feature analysis
In [55]:
# Pairwise relationships between the numeric features, with KDEs on the diagonal.
# `id` is a row identifier and carries no signal, so it is dropped. A fixed-seed sample
# keeps the grid of scatter/KDE panels tractable; plotting all ~381k rows in every panel is very slow.
pairplot_sample = data.drop(columns='id').sample(n=min(len(data), 5000), random_state=42)
sns.pairplot(pairplot_sample, diag_kind='kde')
plt.show()

Histplot¶

In [ ]:
# histplot of age vs response
In [146]:
# Age histogram (30 bins) with KDE curves, one series per Response value
ax = sns.histplot(data=data, x='Age', hue='Response', bins=30, kde=True, palette='Reds')
ax.set_title('Age Distribution by Response')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.show()

Scatterplot¶

In [ ]:
# scatterplot of annual_premium and vintage vs response (TODO: revisit — author flagged this plot as doubtful)
In [179]:
# Annual_Premium against Vintage, with points colored by Response
ax = sns.scatterplot(
    x=data['Annual_Premium'],
    y=data['Vintage'],
    hue=data['Response'],
    palette='Set1',
)
ax.set_title('Annual Premium vs Vintage (Colored by Response)')
ax.set_xlabel('Annual Premium')
ax.set_ylabel('Vintage')
plt.show()

Kdeplot¶

In [ ]:
# kde plot Annual Premium Vs Response
In [148]:
# Filled density of Annual_Premium, one curve per Response value
ax = sns.kdeplot(data=data, x='Annual_Premium', hue='Response', palette='BrBG', fill=True)
ax.set_title('KDE Plot: Annual Premium by Response')
ax.set_xlabel('Annual_Premium')
ax.set_ylabel('Density')
plt.show()

Boxplot¶

In [ ]:
# boxplot of vintage vs response
In [149]:
# Distribution of Vintage for each Response value.
# The y axis is Vintage, so it is labelled as such (it was previously labelled 'Age').
sns.boxplot(x=data['Response'], y=data['Vintage'], palette=['#f0ede4','#435861'])
plt.title('Vintage Vs Response')
plt.xlabel('Insurance Claim (0 = No, 1 = Yes)')
plt.ylabel('Vintage')
plt.show()

Displot¶

In [ ]:
# displot of vintage vs response
In [150]:
# Vintage histogram split by Response, drawn as filled polygons.
# Vintage runs from 10 to 299 in this dataset, so the bin edges cover 10..300;
# the earlier range(0, 51, 5) kept only values up to 50.
sns.displot(data=data, x='Vintage', bins=range(10, 301, 10), fill=True, hue='Response', kind='hist', element='poly', palette='Set1')
plt.show()
In [ ]:
# displot of region code vs response
In [151]:
# Region_Code histogram split by Response, drawn as filled polygons.
# Region codes run from 0 to 52, so the bin edges extend to 55;
# the earlier range(0, 51, 5) left out codes 51 and 52.
sns.displot(data=data, x='Region_Code', bins=range(0, 56, 5), fill=True, hue='Response', kind='hist', element='poly', palette='Set2')
plt.show()
In [ ]:
# displot of policy sales channel and response
In [156]:
# Figure-level filled KDE of the sales-channel codes, one curve per Response value
sns.displot(
    data=data,
    x='Policy_Sales_Channel',
    hue='Response',
    kind='kde',
    fill=True,
)
plt.show()

Age Distribution:¶

Analyze the age distribution within the dataset and its impact on insurance claims¶

Histplot¶

In [ ]:
# Age Distribution
In [157]:
# Age histogram (30 bins, KDE overlay) on an 8x6 inch figure
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data['Age'], bins=30, kde=True, color='#000c66', ax=ax)
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Distribution')
plt.show()

Boxplot¶

In [ ]:
# Impact of Age on Insurance Claims
In [158]:
# Distribution of Age for each Response value on an 8x5 inch figure
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x=data['Response'], y=data['Age'], palette=['#fbe0e0'], ax=ax)
ax.set_title('Impact of Age on Insurance Claims')
ax.set_xlabel('Insurance Claim (0 = No, 1 = Yes)')
ax.set_ylabel('Age')
plt.show()

Premium Analysis:¶

Investigate the distribution of insurance premiums and their correlation with claim frequencies¶

Histplot¶

In [ ]:
# Distribution of Annual Premium
In [160]:
# Histogram of Annual_Premium (30 bins) with a KDE curve overlaid
ax = sns.histplot(data['Annual_Premium'], bins=30, kde=True, color='#5e4d50')
ax.set_title('Distribution of Annual Premium')
ax.set_xlabel('Annual_Premium')
ax.set_ylabel('Frequency')
plt.show()

Boxplot¶

In [ ]:
# Correlation of Annual Premium with claim frequencies(Response)
In [162]:
# Distribution of Annual_Premium for each Response value on an 8x5 inch figure
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x=data['Response'], y=data['Annual_Premium'], palette=['#3d3019', '#e3a60a'], ax=ax)
ax.set_title('Impact of Annual Premium on Insurance Claims')
ax.set_xlabel('Insurance Claim(Response)')
ax.set_ylabel('Annual Premium')
plt.show()

Claim Frequencies:¶

Explore factors contributing to higher claim frequencies¶

Countplot¶

In [ ]:
# Claim Frequency Distribution
In [163]:
# Row count for each Response value on a 5x4 inch figure
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(data=data, x='Response', palette='Set3', ax=ax)
ax.set_title('Claim Frequencies')
ax.set_xlabel('Insurance Claim (Response)')
ax.set_ylabel('Count')
plt.show()

PieChart¶

In [ ]:
# PieChart of claims by gender
In [164]:
# Share of all claims (Response == 1) contributed by each gender.
# Response is 0/1, so summing it counts the claims; .count() would count every row
# per gender and the pie would show the gender split of the dataset instead.
gender_claims = data.groupby('Gender')['Response'].sum()
plt.pie(gender_claims, labels=gender_claims.index, autopct='%1.1f%%', colors=['#004369', '#db1f48'], startangle=90)
plt.title('Proportion of Claims by Gender')
plt.show()

Barplot¶

In [ ]:
# Vehicle Damage vs Claim Frequency 
In [217]:
# Mean Response for each Vehicle_Damage value, without error bars, on a 6x4 inch figure
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(data=data, x='Vehicle_Damage', y='Response', ci=None, ax=ax)
ax.set_title('Claims by Vehicle Damage')
ax.set_xlabel('Vehicle Damage')
ax.set_ylabel('Mean Claim Response')
plt.show()

Stacked Barplot¶

In [ ]:
# stacked barplot showing the claim frequency by vehicle age(in years)
In [168]:
# Rows counted per (vehicle-age bucket, Response) pair, with Response spread across columns
data_grouped = (
    data
    .groupby(['Vehicle_Age(years)', 'Response'])
    .size()
    .unstack()
)
colors = ['#a47551', '#e4d4c8']

# One stacked bar per vehicle-age bucket
ax = data_grouped.plot(kind='bar', stacked=True, figsize=(6, 4), color=colors)
ax.set_title('Claim Frequency by Vehicle Age')
ax.set_xlabel('Vehicle Age(years)')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=0)
ax.legend(title='Claim (1=Yes, 0=No)', fontsize=5)
plt.show()

Pointplot¶

In [ ]:
# pointplot showing the frequency of driving license and claim
In [169]:
# Mean Response for each Driving_License value, drawn as a point plot
ax = sns.pointplot(data=data, x='Driving_License', y='Response', palette='coolwarm')
ax.set_title('Driving License and Claim Frequency')
ax.set_xlabel('Driving License (0 = No, 1 = Yes)')
ax.set_ylabel('Claim Frequency')
plt.show()

Displot¶

In [ ]:
# distribution of previously insured and insurance claim (response)
In [180]:
# Row counts for each Previously_Insured value, split by Response.
# Previously_Insured only takes the values 0 and 1, so it is binned as a discrete variable;
# the earlier bins=range(0, 51, 5) put both values into the single [0, 5) bin.
grid = sns.displot(data=data, x='Previously_Insured', hue='Response', kind='hist', discrete=True, multiple='dodge', shrink=0.8, palette='Set2')
grid.set(xticks=[0, 1])
plt.show()

Gender Analysis:¶

Investigate the role of gender in insurance claims¶

Stacked barplot¶

In [183]:
# stacked barplot showing the claim frequency by gender
In [185]:
# Rows counted per (Gender, Response) pair, with Response spread across columns
data_grouped = (
    data
    .groupby(['Gender', 'Response'])
    .size()
    .unstack()
)
colors = ['#6f6fa6', '#f2e0d5']

# One stacked bar per gender
ax = data_grouped.plot(kind='bar', stacked=True, figsize=(6, 4), color=colors)
ax.set_title("Stacked Bar Plot of Claims by Gender")
ax.set_xlabel("Gender")
ax.set_ylabel("Count")
plt.setp(ax.get_xticklabels(), rotation=0)
ax.legend(title="Claim (1=Yes, 0=No)", fontsize=6)
plt.show()

Vehicle Age and Claims¶

Examine the impact of vehicle age on the likelihood of a claim¶

Boxplot¶

In [ ]:
# impact of vehicle age on claim
In [187]:
# Box plot of the 0/1 Response column for each vehicle-age bucket
# NOTE(review): Response is binary, so a box plot conveys little here — consider a mean-rate bar plot
ax = sns.boxplot(data=data, x='Vehicle_Age(years)', y='Response')
ax.set_title('Impact of Vehicle Age on Claim Likelihood')
plt.show()

Piechart¶

In [ ]:
# proportion of claims by vehicle age
In [189]:
# Share of all claims (Response == 1) contributed by each vehicle-age bucket.
# Summing the 0/1 Response counts claims per bucket; a pie of per-bucket means would
# only compare claim rates with each other and would not be a proportion of claims.
vehicle_claims = data.groupby('Vehicle_Age(years)')['Response'].sum()
plt.pie(vehicle_claims, labels=vehicle_claims.index, autopct='%1.1f%%', colors=['#90adc6','#e9eaec','#fad02c'], startangle=90)
plt.title('Proportion of Claims by Vehicle Age')
plt.show()

Region-wise Analysis:¶

Analyze regional patterns in insurance claims¶

Barplot¶

In [ ]:
# Region-wise Analysis
In [344]:
# Mean Response for each Region_Code, without error bars, on an 8x5 inch figure
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=data, x='Region_Code', y='Response', ci=None, ax=ax)
ax.set_title('Claims by Region')
ax.set_xlabel('Region Code')
ax.set_ylabel('Claim Response')
# Turn the region-code tick labels vertical
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

Policy Analysis:¶

Explore the distribution and impact of different insurance policy types¶

Histplot¶

In [ ]:
# policy wise analysis 
In [193]:
# Sales-channel histogram (20 bins, KDE overlay) split by Response on an 8x5 inch figure
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=data, x='Policy_Sales_Channel', hue='Response', bins=20, kde=True, palette='OrRd', ax=ax)
ax.set_title('Histogram of Claims by Policy Sales Channel')
ax.set_xlabel('Policy Sales Channel')
ax.set_ylabel('Count')
plt.show()

Claim Frequency by Vehicle Damage:¶

Investigate the relationship between vehicle damage and claim frequencies¶

Stacked Barplot¶

In [ ]:
# stacked barplot showing the claim frequency by vehicle damage
In [307]:
# Rows counted per (Vehicle_Damage, Response) pair, with Response spread across columns
data_grouped = (
    data
    .groupby(['Vehicle_Damage', 'Response'])
    .size()
    .unstack()
)
colors = ['#51a7ad', '#f7a7bf']

# One stacked bar per Vehicle_Damage value
ax = data_grouped.plot(kind='bar', stacked=True, figsize=(6, 4), color=colors)
ax.set_title('Claim Frequency by Vehicle Damage')
ax.set_xlabel('Vehicle Damage')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=0)
plt.show()

Customer Loyalty:¶

Analyze if the number of policies held by a customer influences claim likelihood¶

Kdeplot¶

In [ ]:
# KDE Plot of Claims by Policy Sales Channel
In [254]:
# Filled density of the sales-channel codes, one curve per Response value, on an 8x5 inch figure
fig, ax = plt.subplots(figsize=(8, 5))
sns.kdeplot(data=data, x='Policy_Sales_Channel', hue='Response', palette='muted', fill=True, ax=ax)
ax.set_title('KDE Plot of Claims by Policy Sales Channel')
ax.set_xlabel('Policy Sales Channel')
ax.set_ylabel('Density')
plt.show()

Facet plot¶

In [ ]:
# vintage vs response distribution
In [256]:
# Vintage histogram with one facet (column) per Response value.
# Vintage runs from 10 to 299, so the bin edges cover 10..300; the earlier range(0, 51, 5)
# kept only values up to 50. The axis labels are set through the grid so they apply to the
# facets rather than only to the last-drawn axes.
grid = sns.displot(data=data, x='Vintage', bins=range(10, 301, 10), kind='hist', col='Response', element='bars', hue='Response', palette='Set3')
grid.set_axis_labels('Vintage', 'Frequency(Count)')
plt.show()

Stacked Barplot¶

In [ ]:
# previously insured vs response
In [271]:
# Rows counted per (Previously_Insured, Response) pair, with Response spread across columns
data_grouped = (
    data
    .groupby(['Previously_Insured', 'Response'])
    .size()
    .unstack()
)
colors = ['#ffc0d3', '#000000']

# One stacked bar per Previously_Insured value
ax = data_grouped.plot(kind='bar', stacked=True, figsize=(6, 4), color=colors)
ax.set_title('Previously Insured Vs Response')
ax.set_xlabel('Previously_Insured')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=0)
plt.show()
In [ ]: